Pygraphistry Viz


In [1]:
# Imports
import graphistry
import numpy as np
import pandas as pd
from py2neo import Graph, Path

graphistry.register(key='48a82a78fdd442482cec24fe06051c905e2a382d581852a4ba645927c736acbcfe7256e22873a5c97cff6b8bd37c836b')

============================


In [41]:
# Static - Connect to the database
# graph = Graph('http://neo4j:nakama@ec2-34-212-133-23.us-west-2.compute.amazonaws.com:7474')

In [42]:
# tx = graph.cypher.begin()
# for name in ["Alice", "Bob", "Carol"]:
#     tx.append("CREATE (person:Person {name:{name}}) RETURN person", name=name)
# alice, bob, carol = [result.one for result in tx.commit()]

# friends = Path(alice, "KNOWS", bob, "KNOWS", carol)
# graph.create(friends)

In [43]:
# graph.data("MATCH (a:address) --> (b:incoming_payment) --> (c:transaction) RETURN  LIMIT 25")

In [44]:
# rows = pandas.read_csv('transactions.csv')[:1000]
# graphistry.hypergraph(rows)['graph'].plot()

In [45]:
# Retrieve all the paper metadata
# btc_metadata = pd.read_sql_query('SELECT * FROM Papers', conn)

# df = pd.DataFrame(graph.data("MATCH (n:transaction) Return n LIMIT 25"))

df.head()

Retrieve citations data

citations = pd.read_csv('citations.txt', names = ['source', 'target', 'label'])

Dedupe Citations

citations = citations.drop_duplicates(subset=['source', 'target'])

Clean Citations IDs

citations['target'] = citations['target'].str.strip('.') citations['source'] = citations['source'].astype(str).str.strip('.')

Unique subjects

subjects = arxiv_metadata.primary_subject.unique() subject_colors = dict(zip(subjects, range(0, len(subjects)))) arxiv_metadata['color'] = arxiv_metadata.primary_subject.map(lambda x: subject_colors[x])

citations.info()

metadata_merge = citations.merge(arxiv_metadata, left_on='source', right_on='id').merge(arxiv_metadata, left_on='target', right_on='id', suffixes=('_from', '_to'))

metadata_merge.info()

citations = pd.read_csv('Projects/ArXiv/data/citations/citations.txt', names = ['source', 'target', 'label'])

# links = pd.read_csv('./lesmiserables.csv')

citations.head()

Set up the plotter

plotter = graphistry.bind(source="source", destination="target")

plotter.plot(citations)

citations["label"] = citations.value.map(lambda v: "#Meetings: %d" % v)

plotter = plotter.bind(edge_weight="label")

plotter.plot(citations)

Set up igraph for easy metadata etc

ig = plotter.pandas2igraph(citations)

ig = plotter.pandas2igraph(metadata_merge)

Add the Arxiv Metadata

vertex_metadata = pd.DataFrame(ig.vs['nodeid'], columns=['id']).merge(arxiv_metadata, how='left', on='id') ig.vs['primary_subject'] = vertex_metadata['primary_subject'] ig.vs['color'] = vertex_metadata['color'] ig.vs['title'] = vertex_metadata['title'] ig.vs['year'] = vertex_metadata['year'] ig.vs['month'] = vertex_metadata['month'] ig.vs['category'] = vertex_metadata['category']

ig.vs['pagerank'] = ig.pagerank()

ig.vs['community'] = ig.community_infomap().membership

ig.vs['in_degree'] = ig.indegree() plotter.bind(point_size='in_degree', point_color='color').plot(ig)

plotter.bind(point_color='community', point_size='pagerank').plot(ig)


In [ ]:

Silk Road Bitcoin Embezzling Visualization


In [2]:
transactions = pd.read_csv('transactions.csv')
transactions['Date'] = pd.to_datetime(transactions['Date'],unit='ms') #coerce date format
transactions[:3]


Out[2]:
Amount $ Date Destination Source Transaction ID isTainted
0 3223.9752 2013-11-23 20:53:20 84a0b53e1ac008b8dd0fd6212d4b7fa2... 2dd13954e18508bb8b3a41d96a022be9... b6eb8ba20df31fa74fbe7755f58c18f82a599d6bb5fa79... 0
1 3708.0216 2014-05-31 01:33:20 3b62a891b99969042d4e6ac8158d0a18... 7c74d3afb41e536e26948a1d2455a7c7... 60df3c67063e136a0c9715edcd12ae717e6f9ed492afe2... 0
2 2.4800 2014-04-27 00:53:20 3b62a891b99969042d4e6ac8158d0a18... 50dced19b8ee41114916bf3ca894f455... a6aafd3d85600844536b8a5f2c255686c33dc4969e68a4... 0

In [3]:
print('DataFrame headers: {}' .format(list(transactions.columns)))


DataFrame headers: ['Amount $', 'Date', 'Destination', 'Source', 'Transaction ID', 'isTainted']

In [4]:
transactions.columns[-1]


Out[4]:
'isTainted'

In [5]:
# 'taint' is weighted as 5
transactions['isTainted'].unique()


Out[5]:
array([0, 5])

In [6]:
# for item in transactions[transactions['isTainted'] == 5].isTainted:
#     item = 10

In [7]:
# for column in transactions.columns[-1]:
#     transactions[transactions == 5] = 10

In [8]:
transactions.shape


Out[8]:
(45117, 6)

In [9]:
transactions.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45117 entries, 0 to 45116
Data columns (total 6 columns):
Amount $          45117 non-null float64
Date              45117 non-null datetime64[ns]
Destination       45117 non-null object
Source            45117 non-null object
Transaction ID    45117 non-null object
isTainted         45117 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 2.1+ MB

In [10]:
# transaction window
print(transactions['Date'].sort_values().head(1), '\n')
print(transactions['Date'].sort_values().tail(1))


23469   2013-09-01 01:46:40
Name: Date, dtype: datetime64[ns] 

2403   2014-08-22 15:06:40
Name: Date, dtype: datetime64[ns]

Visualization 1: Quick Visualization & Analysis

Task: Spot the embezzling

  1. Use the histogram tool to filter for only tainted transactions
  2. Turn on the Setting "Prune Isolated Nodes" to hide wallets with no remaining transactions
  3. Use the filters or excludes tool to only show transactions over 1000 or 1000.
  4. Verify that money flowed from Ross Ulbricht to Carl Force, and explore where else it flowed.

In [11]:
g = graphistry.edges(transactions).bind(source='Source', destination='Destination')

In [12]:
g.plot()


/Users/eastblue/anaconda/lib/python3.6/site-packages/graphistry/pygraphistry.py:466: FutureWarning: pandas.tslib is deprecated and will be removed in a future version.
You can access NaTType as type(pandas.NaT)
  elif isinstance(obj, pandas.tslib.NaTType):
Out[12]:

Visualization 2: Summarizing Wallets


In [13]:
# Compute how much wallets received in new df 'wallet_in'
wallet_in = transactions\
.groupby('Destination')\
.agg({'isTainted': lambda x: 1 if x.sum() > 0 else 0, 'Amount $': np.sum})\
.reset_index().rename(columns={'Destination': 'wallet', 'isTainted': 'isTaintedWallet'})
# rename destination to wallet
# rename isTainted to isTaintedWallet

#not all wallets received money, tag these
wallet_in['Receivables'] = True

wallet_in[:3]


Out[13]:
wallet isTaintedWallet Amount $ Receivables
0 0002b3efbc3e742ee4cfaad18d8cf221... 0 41118.416840 True
1 0005e0fbac078e609bbc3239d3302ff7... 1 5577.768000 True
2 000b3df00e3ff9b7705452071c9e4e87... 0 11161.133824 True

In [14]:
wallet_in['isTaintedWallet'].unique()


Out[14]:
array([0, 1])

In [15]:
# Compute how much wallets sent in new df 'wallet_out'
wallet_out = transactions\
  .groupby('Source')\
  .agg({'isTainted': np.sum, 'Amount $': np.max})\
  .reset_index().rename(columns={'Source': 'wallet', 'isTainted': 'isTaintedWallet'})
# rename source to wallet
# rename isTainted to isTaintedWallet

#not all wallets received money, tag these
wallet_out['Payables'] = True

wallet_out[:3]


Out[15]:
wallet isTaintedWallet Amount $ Payables
0 0005e0fbac078e609bbc3239d3302ff7... 0 6197.520000 True
1 000b3df00e3ff9b7705452071c9e4e87... 0 857.923098 True
2 0012742095ed1c2ceb334b2a5403da7d... 0 3472.000000 True

In [16]:
wallet_out['isTaintedWallet'].unique()


Out[16]:
array([ 0,  5, 10, 20, 15, 25, 35, 30])

In [17]:
# Join Data
wallets = pd.merge(wallet_in, wallet_out, how='outer')
wallets['Receivables'] = wallets['Receivables'].fillna(False)
wallets['Payables'] = wallets['Payables'].fillna(False)
print('# Wallets only sent or only received', len(wallet_in) + len(wallet_out) - len(wallets))
wallets[:3]


# Wallets only sent or only received 875
Out[17]:
wallet isTaintedWallet Amount $ Receivables Payables
0 0002b3efbc3e742ee4cfaad18d8cf221... 0 41118.416840 True False
1 0005e0fbac078e609bbc3239d3302ff7... 1 5577.768000 True False
2 000b3df00e3ff9b7705452071c9e4e87... 0 11161.133824 True False

In [18]:
tmp = wallets

In [19]:
# colors at: http://staging.graphistry.com/docs/legacy/api/0.9.2/palette.html#Paired
def convert_to_colors(value):
    if value == 0:
        return 36005 # magenta
    else:
        return 42005 # orange

tmp['isTaintedWallet'] = tmp['isTaintedWallet'].apply(convert_to_colors)

In [20]:
tmp['isTaintedWallet'].unique()


Out[20]:
array([36005, 42005])

Plot

Bind color to whether tainted


In [24]:
g.nodes(tmp).bind(node='wallet', point_color='isTaintedWallet').plot()


Out[24]:

Plain-no-audio.mov


In [ ]: